In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import os
# Set the seaborn plotting style (white grid background) before any chart is drawn.
sns.set(style="whitegrid", color_codes=True)
import string
We need some books for the program to read and analyse. A good source of free books is Project Gutenberg: http://www.gutenberg.org/
In [32]:
# Load the book (boy.txt in this case) and read its title from the file header.
with open('boy.txt', encoding='utf-8') as book:
    for line in book:
        # The header contains a line of the form "Title: <book title>".
        if line.startswith('Title:'):
            # Drop the "Title:" prefix together with the blank after it and the
            # trailing newline, which would otherwise leak into every place the
            # title is printed or used as a plot heading.
            title = line[6:].strip()
            print("The title is", title)
            # Nothing else is needed from the file in this cell.
            break
In [34]:
# Project Gutenberg wraps the book text in "*** START OF ..." / "*** END OF ..."
# lines; files use either the "THIS PROJECT" or the "THE PROJECT" wording.
start_markers = ('*** START OF THIS PROJECT', '*** START OF THE PROJECT')
end_markers = ('*** END OF THIS PROJECT', '*** END OF THE PROJECT')

with open('boy.txt', encoding='utf-8') as book:
    # Header: pick up the title, then stop at the start-of-text marker.
    for line in book:
        if line.startswith('Title:'):
            # strip() removes the blank after "Title:" and the trailing newline.
            title = line[6:].strip()
            print("The title is", title)
        if line.startswith(start_markers):
            break

    # One counter per letter, so letters that never occur still show up as 0.
    letter_frequencies = dict.fromkeys(string.ascii_uppercase, 0)

    # Body: the same file iterator continues right after the start marker.
    for line in book:
        # Stop before the closing marker so that neither the marker itself nor
        # the licence text following it is counted as part of the book.
        if line.startswith(end_markers):
            break
        for char in line:
            # Case is folded to upper so 'a' and 'A' share one counter;
            # anything that is not an ASCII letter is ignored.
            if char in string.ascii_letters:
                letter_frequencies[char.upper()] += 1

print(letter_frequencies)
In [ ]:
# NOTE(review): this cell is a stray fragment and cannot run by itself --
# `break` is only valid inside a loop, and `line` is whatever an earlier cell
# left behind. It looks like it was meant to sit inside a loop that reads the
# book line by line, ending the count at the Project Gutenberg end marker.
# TODO: move it into that loop or delete this cell.
if line.startswith('*** END OF THIS PROJECT'):
break
In [27]:
# Turn the raw letter counts into percentages of all counted letters.
total = sum(letter_frequencies.values())

# When no letters were counted at all (for example because the start marker was
# never found in the file) every share is reported as 0 instead of failing with
# a ZeroDivisionError.
letter_percentages = {
    letter: (count / total * 100 if total else 0.0)
    for letter, count in letter_frequencies.items()
}

# One list per column, passed to seaborn's barplot through `data=`.
data = {
    'letters': list(letter_percentages.keys()),
    'percentages': list(letter_percentages.values()),
}
In [12]:
# Bar chart: one bar per letter, bar height = that letter's share of the total.
plt.figure()
plot = sns.barplot(data=data, x='letters', y='percentages', palette='rainbow_r')

# Heading and axis labels are applied in a single call.
plot.set(
    title="{0}\n total {1} characters".format(title, total),
    xlabel='Letters',
    ylabel='Percentages',
)

# Render the y-axis ticks as whole-number percentages.
plot.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))
In [9]:
# Lines Project Gutenberg places directly before and after the book text.
# Files use either the "THIS PROJECT" or the "THE PROJECT" wording.
_START_MARKERS = ('*** START OF THIS PROJECT', '*** START OF THE PROJECT')
_END_MARKERS = ('*** END OF THIS PROJECT', '*** END OF THE PROJECT')


def _scan_book(lines):
    """Return ``(title, letter_frequencies)`` for an iterable of text lines.

    Args:
        lines: Iterable of strings, one per line of the book file.

    Returns:
        A tuple ``(title, letter_frequencies)``. ``title`` is the text after
        the ``Title:`` prefix of a header line ('' when there is none), with
        surrounding whitespace removed. ``letter_frequencies`` maps each of
        the 26 uppercase ASCII letters to the number of times it occurs
        (in either case) between the start marker and the end marker.
    """
    lines = iter(lines)
    title = ''

    # Header: remember the title and stop at the start-of-text marker.
    for line in lines:
        if line.startswith('Title:'):
            # strip() removes the blank after "Title:" and the trailing newline.
            title = line[6:].strip()
        if line.startswith(_START_MARKERS):
            break

    # Every letter starts at 0 so that unused letters still appear in the plot.
    letter_frequencies = dict.fromkeys(string.ascii_uppercase, 0)

    # Body: the iterator continues right after the start marker.
    for line in lines:
        # Checked before counting, so the marker line itself is not counted.
        if line.startswith(_END_MARKERS):
            break
        for char in line:
            # Non-ASCII letters, digits and punctuation are ignored.
            if char in string.ascii_letters:
                letter_frequencies[char.upper()] += 1

    return title, letter_frequencies


def word_analysis(filename):
    """Plot the letter-frequency distribution of one book text file.

    The file is read as UTF-8. Letters are counted between the Project
    Gutenberg start and end marker lines, converted to percentages of all
    counted letters, and drawn as a seaborn bar chart on a new figure.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        None. The function's result is the figure it draws.
    """
    with open(filename, encoding='utf-8') as book:
        title, letter_frequencies = _scan_book(book)

    total = sum(letter_frequencies.values())
    # A file without a start marker (or without any letters) yields total == 0;
    # report 0% for every letter instead of raising ZeroDivisionError, so one
    # unsuitable file does not abort a run over many files.
    letter_percentages = {
        letter: (count / total * 100 if total else 0.0)
        for letter, count in letter_frequencies.items()
    }
    data = {
        'letters': list(letter_percentages.keys()),
        'percentages': list(letter_percentages.values()),
    }

    plt.figure()
    plot = sns.barplot(x='letters', y='percentages', data=data, palette='rainbow_r')
    plot.set_title("{0}\n total {1} characters".format(title, total))
    plot.set_xlabel('Letters')
    plot.set_ylabel('Percentages')
    # Show the y-axis ticks as whole-number percentages.
    plot.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))
In [11]:
# Run the analysis on every .txt file that sits directly in the working directory.
for _root, _dirs, filenames in os.walk('.'):
    for name in filenames:
        if not name.endswith('.txt'):
            continue
        word_analysis(name)
    # os.walk yields the top directory first; leaving the loop here means
    # sub-directories are never visited.
    break
In [ ]: